This notebook builds a set of high-dimensional emulators of timeseries of JULES global mean carbon cycle outputs. It uses PCA to reduce the dimension of the output, and builds a Gaussian Process emulator of the output in the reduced dimension.
This notebook builds on the work of explore-jules-timeseries-emulate, and applies the emulator to multiple timeseries.
# Libraries
library(DiceKriging)
library(parallel)
# Helper functions
anomalizeTSmatrix = function(x, ix){
  # Express each row of a timeseries matrix as an anomaly from its own
  # baseline, where the baseline is the row mean over the columns in ix
  # (e.g. a reference period of the timeseries).
  # x .... numeric matrix, one timeseries per row
  # ix ... column indices defining the baseline period
  # Returns a matrix of the same shape as x.
  baseline <- rowMeans(x[, ix])
  sweep(x, 1, baseline, FUN = '-')
}
mat2list <- function(X){
  # Turn the p columns of a matrix into a p-length list, with each column
  # becoming an element of the list.
  # X ... matrix with ncol(X) >= 0
  # Returns an unnamed list of length ncol(X).
  # seq_len() (rather than 1:ncol(X)) handles the zero-column edge case,
  # where 1:0 would iterate over c(1, 0) and fail on out-of-bounds subscripts.
  lapply(seq_len(ncol(X)), function(j) X[, j])
}
reset <- function() {
# Allows annotation of graphs, resets axes
# Resets the device to a single full-window panel with zero outer and
# inner margins, then opens an invisible (type = "n") full-extent plot.
# Because new = TRUE, the existing figure is NOT cleared, so subsequent
# calls such as legend() can be placed anywhere over the whole device.
par(mfrow=c(1, 1), oma=rep(0, 4), mar=rep(0, 4), new=TRUE)
plot(0:1, 0:1, type="n", xlab="", ylab="", axes=FALSE)
}
makeTransparent <- function(someColor, alpha = 100) {
  # Convert colour name(s) or hex colour(s) into semi-transparent hex
  # colours for overplotting many lines.
  # someColor ... vector of colours accepted by col2rgb()
  # alpha ....... opacity on 0 (transparent) to 255 (opaque)
  # Returns a character vector of "#RRGGBBAA" strings, one per input colour.
  rgbvals <- col2rgb(someColor)
  vapply(seq_len(ncol(rgbvals)), function(j) {
    rgb(red = rgbvals[1, j], green = rgbvals[2, j], blue = rgbvals[3, j],
        alpha = alpha, maxColorValue = 255)
  }, character(1))
}
pca_km <- function(X, Y, train_ix, test_ix, npctol = 0.99, scale = FALSE, center = TRUE, ...){
# emulate high dimensional output
# Emulate a high-dimensional (timeseries) output by (1) reducing the output
# dimension with PCA, (2) fitting an independent DiceKriging Gaussian process
# emulator (km) to each retained PC score, and (3) projecting predictions
# back up to the original output space.
# X .......... design matrix of inputs, one run per row
# Y .......... output matrix, one run per row (columns e.g. timesteps)
# train_ix ... row indices used to train the emulators
# test_ix .... row indices held out for testing
# npctol ..... retain enough PCs to explain this cumulative proportion of
#              variance (minimum of 2 PCs)
# scale, center ... passed to prcomp(); scale is also used to undo the
#              scaling when reconstructing outputs
# ... ........ NOTE(review): accepted but not forwarded to km() below
require(parallel)
# Split into training and test samples
X_train <- X[train_ix, ]
X_test <- X[test_ix, ]
Y_train <- Y[train_ix, ]
Y_test <- Y[test_ix, ]
#reduce dimension of the output
pca <- prcomp(Y_train, scale = scale, center = center)
# choose a number of pcs
pca_summary <- summary(pca)
# 2 PCs minimum
# importance[3, ] is the "Cumulative Proportion" row of summary(prcomp).
# If 2 PCs already reach npctol, use 2; otherwise use the first PC whose
# cumulative proportion exceeds npctol.
# NOTE(review): if no PC exceeds npctol, which(...)[1] is NA and npc becomes
# NA, which would fail downstream -- cannot happen for npctol < 1.
if(pca_summary$importance[3,2] < npctol){
npc <- as.numeric(which(pca_summary$importance[3,] > npctol)[1])
}
else{
npc <- 2
}
# One list element per retained PC score vector (training runs only)
scores_train_list <- mat2list(pca$x[, 1:npc])
# fitting the emulator is a slow step, so we use parallel computation
# Fit an emulator to each principal component in turn
# NOTE(review): mc.cores is hard-coded to 4 -- consider parameterizing.
fit_list <- mclapply(X = scores_train_list, FUN = km, formula = ~., design = X_train,
mc.cores = 4, control = list(trace = FALSE, maxit = 200))
# Accumulate emulated PC scores column-by-column: training runs via
# leave-one-out cross-validation, test runs via prediction.
scores_em_test_mean <- NULL
scores_em_test_sd <- NULL
scores_em_train_mean <- NULL
scores_em_train_sd <- NULL
for(i in 1:npc){
# Leave-one-out gives an honest in-sample prediction for the training runs
loo <- leaveOneOut.km(fit_list[[i]], type = 'UK', trend.reestim = TRUE)
pred_test <- predict(fit_list[[i]], newdata = X_test, type = 'UK')
# Predict training data (low dimension representation)
scores_em_train_mean <- cbind(scores_em_train_mean, loo$mean)
scores_em_train_sd <- cbind(scores_em_train_sd, loo$sd)
# Predict test data (low dimension representation)
scores_em_test_mean <- cbind(scores_em_test_mean, pred_test$mean)
scores_em_test_sd <- cbind(scores_em_test_sd, pred_test$sd)
}
# Project back up to high dimension
# Reconstruction: anomaly = rotation[, 1:npc] %*% t(scores). When prcomp
# scaled the data, multiply by pca$scale (recycled down columns) to undo it.
# NOTE(review): mapping the score sds through the loadings is a heuristic
# uncertainty estimate; it ignores loading signs and truncation error.
if(scale){
anom_train <- pca$rotation[ ,1:npc] %*% t(scores_em_train_mean[,1:npc])*pca$scale
anom_test <- pca$rotation[ ,1:npc] %*% t(scores_em_test_mean[,1:npc])*pca$scale
sd_train <- t(pca$rotation[ ,1:npc] %*% t(scores_em_train_sd[,1:npc])*pca$scale)
sd_test <- t(pca$rotation[ ,1:npc] %*% t(scores_em_test_sd[,1:npc])*pca$scale)
}
else{
anom_train <- pca$rotation[ ,1:npc] %*% t(scores_em_train_mean[,1:npc])
anom_test <- pca$rotation[ ,1:npc] %*% t(scores_em_test_mean[,1:npc])
sd_train <- t(pca$rotation[ ,1:npc] %*% t(scores_em_train_sd[,1:npc]))
sd_test <- t(pca$rotation[ ,1:npc] %*% t(scores_em_test_sd[,1:npc]))
}
# Add the centre back (recycled down columns) and transpose so rows are
# runs again, matching the orientation of Y.
Ypred_train <- t(anom_train + pca$center)
Ypred_test <- t(anom_test + pca$center)
# Emulator errors in the original output space
Yerr_train <- Y_train - Ypred_train
Yerr_test <- Y_test - Ypred_test
return(list(X = X,
Y = Y,
train_ix = train_ix,
test_ix = test_ix,
X_train = X_train,
X_test = X_test,
npc = npc,
pca = pca,
fit_list = fit_list,
scores_em_train_mean = scores_em_train_mean,
scores_em_train_sd = scores_em_train_sd,
scores_em_test_mean = scores_em_test_mean,
scores_em_test_sd = scores_em_test_sd,
Ypred_train = Ypred_train,
Ypred_test = Ypred_test,
sd_train = sd_train,
sd_test = sd_test,
Yerr_train = Yerr_train,
Yerr_test = Yerr_test
))
}
pca_km_errorStats <- function(fit, nround = 2, compare_ix = NULL){
  # Calculate mean absolute error statistics from a pca_km object.
  # fit ........... output object from pca_km (uses $Yerr_train, $Yerr_test)
  # nround ........ decimals to round in output
  # compare_ix .... row indices of the training-set errors to include;
  #                 defaults to all training rows
  # Returns a list with elements trainMAE and testMAE.
  if(is.null(compare_ix)){
    # seq_len() is safe for a zero-row error matrix (1:nrow would give c(1, 0));
    # the original's `else{ compare_ix <- compare_ix }` no-op branch is removed.
    compare_ix <- seq_len(nrow(fit$Yerr_train))
  }
  # drop = FALSE keeps a matrix when compare_ix selects a single row
  trainMAE <- round(mean(abs(fit$Yerr_train[compare_ix, , drop = FALSE])), nround)
  testMAE <- round(mean(abs(fit$Yerr_test)), nround)
  return(list(trainMAE = trainMAE,
              testMAE = testMAE))
}
pca_km_tsSparkPlot <- function(pca_km_obj, nr, nc, transp, maintext, yrs, obscol = 'black', predcol = 'red', ...){
  # Timeseries test-set prediction sparkline plot (small multiples).
  # pca_km_obj ....... output object from pca_km
  # nr, nc ........... rows/columns of small-multiple panels
  # transp ........... alpha (0-255) for the +-2sd envelope lines
  # maintext ......... overall title
  # yrs .............. x-axis values, length ncol(Y)
  # obscol, predcol .. colours for observed and predicted lines
  # FIX: the original indexed pca_km_obj$Y with the GLOBAL variable test_ix
  # rather than the test indices stored in the fit object itself.
  Y_test <- pca_km_obj$Y[pca_km_obj$test_ix, , drop = FALSE]
  par(mfrow = c(nr,nc), mar = c(0.1, 0.1, 0.1, 0.1), oma = c(1,0.1,5,0.1))
  # Common y-limits covering the +-2sd envelope of every test prediction
  ylim <- range(pca_km_obj$Ypred_test + 2*(pca_km_obj$sd_test),
                pca_km_obj$Ypred_test - 2*(pca_km_obj$sd_test))
  for(i in seq_along(pca_km_obj$test_ix)){
    plot(yrs, Y_test[i, ], ylim = ylim, type = 'n', axes = FALSE)
    # Grey panel background
    rect(par("usr")[1], par("usr")[3],
         par("usr")[2], par("usr")[4],
         col = "grey90", border = "grey90")
    lines(yrs, Y_test[i, ], col = obscol, lwd = 1.5)
    lines(yrs, pca_km_obj$Ypred_test[i, ], col = predcol, lwd = 1.5)
    lines(yrs, pca_km_obj$Ypred_test[i, ] + 2*(pca_km_obj$sd_test[i, ]), col = makeTransparent(predcol,transp))
    lines(yrs, pca_km_obj$Ypred_test[i, ] - 2*(pca_km_obj$sd_test[i, ]), col = makeTransparent(predcol,transp))
  }
  mtext(maintext, side = 3, outer = TRUE, cex = 1.5, line = 1)
  reset()
  legend('topleft', col = c(obscol, predcol, makeTransparent(predcol,transp)),
         legend = c('observed', 'predicted', '+-2sd'), lty = 'solid', horiz = TRUE, bty = 'n')
}
# A function to identify rows with outlier data (from Google Bard, adapted to update threshold)
findAnomalies <- function(matrix, iqrThres = 1.5, madThres = 1.5, na.rm = FALSE) {
# Check for NA values
if (!na.rm) {
naRows <- which(apply(matrix, 1, is.na))
if (length(naRows) > 0) {
return(naRows)
}
}
# Check for outliers using IQR method
iqr <- IQR(matrix, na.rm = na.rm)
q1 <- quantile(matrix, 0.25, na.rm = na.rm)
q3 <- quantile(matrix, 0.75, na.rm = na.rm)
outlierRows <- which(apply(matrix, 1, function(row) {
any(row < q1 - iqrThres * iqr | row > q3 + iqrThres * iqr)
}))
# Check for discontinuities
discontinuityRows <- which(apply(matrix, 1, function(row) {
any(diff(row) > madThres * mad(row))
}))
# Combine NA, outlier, and discontinuity rows
anomalyRows <- union(union(naRows, outlierRows), discontinuityRows)
# Return index of anomaly rows
return(anomalyRows)
}
findAnomaliesTSLocal <- function(matrix, iqrThres = 5, madThres = 3, na.rm = FALSE) {
  # Find outlier runs in an ensemble, judging each timestep (column) on its
  # own scale: a row is flagged when any value falls outside its column's
  # quartiles by more than iqrThres local IQRs, or when it contains a
  # discontinuity (one-step jump > madThres * the row's MAD), or contains NA.
  # Returns the indices of anomalous rows (possibly integer(0)).
  # Defined up front so the final union() works even when na.rm = TRUE
  # (the original left naRows undefined in that case and crashed).
  naRows <- integer(0)
  if (!na.rm) {
    # Rows containing any NA; the original's which(apply(matrix, 1, is.na))
    # returned linear indices into a logical matrix, not row numbers.
    naRows <- which(apply(matrix, 1, anyNA))
    if (length(naRows) > 0) {
      return(naRows)
    }
  }
  # Local (per-column) IQR and quartiles; na.rm is now forwarded to
  # quantile() as well (the original omitted it, so na.rm = TRUE with NAs
  # present would error here).
  iqr <- apply(matrix, 2, FUN = IQR, na.rm = na.rm)
  q1 <- apply(matrix, 2, FUN = quantile, probs = 0.25, na.rm = na.rm)
  q3 <- apply(matrix, 2, FUN = quantile, probs = 0.75, na.rm = na.rm)
  # Element-wise comparison of each row against its columns' bands
  outlierRows <- which(apply(matrix, 1, function(row) {
    any(row < q1 - iqrThres * iqr | row > q3 + iqrThres * iqr)
  }))
  # Check for discontinuities
  discontinuityRows <- which(apply(matrix, 1, function(row) {
    any(diff(row) > madThres * mad(row))
  }))
  # Combine NA, outlier, and discontinuity rows
  anomalyRows <- union(union(naRows, outlierRows), discontinuityRows)
  return(anomalyRows)
}
# Load data
# Loads the JULES ensemble objects used below: the design matrices X and
# X_wave01_train, the `years` vector, and the *_ens_wave00 / *_ens_wave01
# output matrices (runs in rows, years in columns).
load('data/ensemble-jules-historical-timeseries.RData')
# Use only the post-1950 years
# NOTE(review): assumes index 101 of `years` corresponds to 1950 -- confirm
# against the loaded data.
years_ix <- 101:164
years_trunc <- years[years_ix]
For each input/output set, it is useful to have functions that identify ensemble members containing “bad” data — outliers or NAs, for example.
# --- Cumulative NBP (cnbp): build, clean, emulate and check ---
# Cumulative sum of NBP over the truncated years, one row per ensemble member
cnbp_ens_wave00 <- t(apply(nbp_ens_wave00[ , years_ix], 1, FUN = cumsum))
cnbp_ens_wave01 <- t(apply(nbp_ens_wave01[ , years_ix], 1, FUN = cumsum))
# Wave00 members with a-priori "bad" parameter values are excluded
kill_ix_wave00 <- c(which(X[, 'f0_io'] > 0.9 | X[, 'b_wl_io'] < 0.15 ))
# remove those iqrThres times outside the IQR
kill_ix_wave01 <- findAnomaliesTSLocal(cnbp_ens_wave01, iqrThres = 5, madThres = 3)
# NOTE(review): if either kill index is integer(0), negative indexing
# (e.g. X[-integer(0), ]) selects ZERO rows, silently emptying the data.
# The batch loop at the end of the file guards against this; this chunk
# does not.
X_combine <- rbind(X[-kill_ix_wave00, ], X_wave01_train[-kill_ix_wave01, ])
Y_combine <- rbind(cnbp_ens_wave00[-kill_ix_wave00,], cnbp_ens_wave01[-kill_ix_wave01, ])
Y_remove <- rbind(cnbp_ens_wave00[kill_ix_wave00,], cnbp_ens_wave01[kill_ix_wave01, ])
# Visual check: retained (left) vs removed (right) ensemble members
par(mfrow = c(1,2))
matplot(years_trunc, t(Y_combine), type = 'l', lty = 'solid', col = 'black', main = 'retained runs')
matplot(years_trunc, t(Y_remove), type = 'l', lty = 'solid', ylim = c(-100, 100),
col = c(rep('black', length(kill_ix_wave00)), rep('red', length(kill_ix_wave01))),
main = 'removed runs')
# Hold out the final 20% of runs as the test set
testprop <- 0.2
nruns <- nrow(Y_combine)
ntest <- floor(nruns * testprop)
ntrain <- (nruns - ntest)
train_ix <- 1:ntrain
test_ix <- (ntrain+1):nruns
# Fit the PCA + kriging emulator to the combined, cleaned ensemble
pc_km_cnbp_combine <- pca_km(X = X_combine,
Y_combine,
train_ix = train_ix,
test_ix = test_ix,
npctol = 0.99,
scale = TRUE,
center = TRUE)
# Scree plot of the principal component variances
plot(pc_km_cnbp_combine$pca)
# Mean absolute error on train and test sets
pca_km_errorStats(pc_km_cnbp_combine)
## $trainMAE
## [1] 5.42
##
## $testMAE
## [1] 7.15
pca_km_tsSparkPlot(pc_km_cnbp_combine, nr = 8, nc = 20, yrs = years_trunc, maintext = 'CNBP test predictions', transp = 100)
# Train/test overlays: observed (black) vs emulated (red)
par(mfrow = c(1,2))
matplot(years_trunc, t(Y_combine[train_ix, ]), type = 'l', lty = 'solid', col = makeTransparent('black',100), lwd = 2,
main = "Train",
ylab = 'CNBP')
matlines(years_trunc, t(pc_km_cnbp_combine$Ypred_train), lty = 'solid', col = makeTransparent('red',100), lwd = 2)
legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')
# NOTE(review): the ylab below says 'NPP anomaly' but this panel plots
# cumulative NBP -- looks like a copy-paste slip in the label.
matplot(years_trunc, t(Y_combine[test_ix, ]), type = 'l', lty = 'solid', col = makeTransparent('black',100), lwd = 2,
main = 'Test',
ylab = 'NPP anomaly from 1940 - 1959 mean')
matlines(years_trunc, t(pc_km_cnbp_combine$Ypred_test), lty = 'solid', col = makeTransparent('red',100), lwd = 2)
legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')
# --- NPP: clean, emulate and check (same recipe as the cnbp chunk) ---
# Wave00 members with a-priori "bad" parameter values are excluded
kill_ix_wave00 <- c(which(X[, 'f0_io'] > 0.9 | X[, 'b_wl_io'] < 0.15 ))
# remove those iqrThres times outside the IQR
kill_ix_wave01 <- findAnomaliesTSLocal(npp_ens_wave01[, years_ix], iqrThres = 5, madThres = 3)
# NOTE(review): negative indexing with an empty kill index would silently
# select zero rows -- see the guard used in the batch loop below.
X_combine <- rbind(X[-kill_ix_wave00, ], X_wave01_train[-kill_ix_wave01, ])
# WARNING - if nothing is removed, nothing is combined
Y_combine <- rbind(npp_ens_wave00[-kill_ix_wave00, years_ix], npp_ens_wave01[-kill_ix_wave01, years_ix])
Y_remove <- rbind(npp_ens_wave00[kill_ix_wave00, years_ix], npp_ens_wave01[kill_ix_wave01, years_ix])
# Visual check: retained vs removed ensemble members
par(mfrow = c(1,2))
matplot(years_trunc, t(Y_combine), type = 'l', lty = 'solid', col = 'black', main = 'retained runs')
matplot(years_trunc, t(Y_remove), type = 'l', lty = 'solid',
col = c(rep('black', length(kill_ix_wave00)), rep('red', length(kill_ix_wave01))),
main = 'removed runs')
# Hold out the final 20% of runs as the test set
testprop <- 0.2
nruns <- nrow(Y_combine)
ntest <- floor(nruns * testprop)
ntrain <- (nruns - ntest)
train_ix <- 1:ntrain
test_ix <- (ntrain+1):nruns
# Fit the PCA + kriging emulator
pc_km_npp_combine <- pca_km(X = X_combine,
Y_combine,
train_ix = train_ix,
test_ix = test_ix,
npctol = 0.99,
scale = TRUE,
center = TRUE)
pca_km_tsSparkPlot(pc_km_npp_combine, nr = 8, nc = 20, yrs = years_trunc, maintext = 'NPP test predictions', transp = 100)
# --- NPP anomalies: anomalize, clean, emulate and check ---
# Express NPP as an anomaly from each run's mean over the first 20
# truncated years (the baseline period)
npp_anom_ens_wave00 <- anomalizeTSmatrix(npp_ens_wave00[, years_ix], 1:20)
npp_anom_ens_wave01 <- anomalizeTSmatrix(npp_ens_wave01[, years_ix], 1:20)
# Wave00 members with a-priori "bad" parameter values are excluded
kill_ix_wave00 <- c(which(X[, 'f0_io'] > 0.9 | X[, 'b_wl_io'] < 0.15 ))
# remove those iqrThres times outside the IQR
kill_ix_wave01 <- findAnomaliesTSLocal(npp_anom_ens_wave01, iqrThres = 5, madThres = 3)
X_combine <- rbind(X[-kill_ix_wave00, ], X_wave01_train[-kill_ix_wave01, ])
# WARNING - if nothing is removed, nothing is combined
Y_combine <- rbind(npp_anom_ens_wave00[-kill_ix_wave00, ], npp_anom_ens_wave01[-kill_ix_wave01, ])
Y_remove <- rbind(npp_anom_ens_wave00[kill_ix_wave00, ], npp_anom_ens_wave01[kill_ix_wave01, ])
# Visual check: retained vs removed ensemble members
par(mfrow = c(1,2))
matplot(years_trunc, t(Y_combine), type = 'l', lty = 'solid', col = 'black', main = 'retained runs')
matplot(years_trunc, t(Y_remove), type = 'l', lty = 'solid',
col = c(rep('black', length(kill_ix_wave00)), rep('red', length(kill_ix_wave01))),
main = 'removed runs')
# Hold out the final 20% of runs as the test set
testprop <- 0.2
nruns <- nrow(Y_combine)
ntest <- floor(nruns * testprop)
ntrain <- (nruns - ntest)
train_ix <- 1:ntrain
test_ix <- (ntrain+1):nruns
# Fit the PCA + kriging emulator
pc_km_npp_anom_combine <- pca_km(X = X_combine,
Y_combine,
train_ix = train_ix,
test_ix = test_ix,
npctol = 0.99,
scale = TRUE,
center = TRUE)
pca_km_tsSparkPlot(pc_km_npp_anom_combine, nr = 8, nc = 20, yrs = years_trunc, maintext = 'NPP anomaly test predictions', transp = 100)
# --- Batch loop: clean and emulate each JULES output variable in turn ---
# Variable name stems; the matching ensembles are fetched by get() below
# as <varname>_ens_wave00 / <varname>_ens_wave01.
varnames <- c("baresoilFrac_lnd_mean",
"c3PftFrac_lnd_mean",
"c4PftFrac_lnd_mean",
#"cnbp",
"cSoil",
"cVeg",
"fHarvest_lnd_sum",
"fLuc_lnd_sum",
"lai_lnd_mean",
"nbp",
"npp",
"rh_lnd_sum",
"shrubFrac_lnd_mean",
"treeFrac_lnd_mean")
# Wave00 members with a-priori "bad" parameter values are excluded throughout
kill_ix_wave00 <- c(which(X[, 'f0_io'] > 0.9 | X[, 'b_wl_io'] < 0.15 ))
testprop <- 0.2
for(i in 1:length(varnames)){
varname <- varnames[i]
# Fetch both waves of this variable's ensemble, truncated to post-1950
ens_wave00 <- get(paste0(varname, '_ens_wave00'))[, years_ix]
ens_wave01 <- get(paste0(varname, '_ens_wave01'))[ , years_ix]
# remove those iqrThres times outside the IQR
kill_ix_wave01 <- findAnomaliesTSLocal(ens_wave01, iqrThres = 5, madThres = 3)
# Guard against the negative-indexing-with-integer(0) trap: if nothing is
# flagged in wave01, keep ALL wave01 rows rather than indexing with
# -integer(0) (which would drop every row).
if(identical(kill_ix_wave01, integer(0))){
X_combine <- rbind(X[-kill_ix_wave00, ], X_wave01_train)
Y_combine <- rbind(ens_wave00[-kill_ix_wave00, ], ens_wave01)
Y_remove <- rbind(ens_wave00[kill_ix_wave00, ])
}
else{
X_combine <- rbind(X[-kill_ix_wave00, ], X_wave01_train[-kill_ix_wave01, ])
Y_combine <- rbind(ens_wave00[-kill_ix_wave00, ], ens_wave01[-kill_ix_wave01, ])
Y_remove <- rbind(ens_wave00[kill_ix_wave00, ], ens_wave01[kill_ix_wave01, ])
}
#par(mfrow = c(1,2), oma = c(0.1, 0.1, 5, 0.1))
#matplot(years_trunc, t(Y_combine), type = 'l', lty = 'solid', col = 'black', main = 'retained runs')
#matplot(years_trunc, t(Y_remove), type = 'l', lty = 'solid',
#col = c(rep('black', length(kill_ix_wave00)), rep('red', length(kill_ix_wave01))),
#main = 'removed runs')
#mtext(side = 3, text = varname, outer = TRUE, line = 3, cex = 2)
# The fitted emulator is stored in the global workspace as pc_km_<varname>
outname <- paste0('pc_km_', varname)
# Hold out the final 20% of runs as the test set
nruns <- nrow(Y_combine)
ntest <- floor(nruns * testprop)
ntrain <- (nruns - ntest)
train_ix <- 1:ntrain
test_ix <- (ntrain+1):nruns
pc_km <- pca_km(X = X_combine,
Y_combine,
train_ix = train_ix,
test_ix = test_ix,
npctol = 0.99,
scale = TRUE,
center = TRUE)
assign(outname, pc_km)
pca_km_tsSparkPlot(get(outname), nr = 8, nc = 20, yrs = years_trunc, maintext = varname, transp = 100)
}